Reading the data¶
Previously, the .fits data was converted to .csv and saved in database/csv_files. We are going to import a light-curve sample and work with it.
[2]:
import pandas as pd
from datetime import datetime
!pip install control
from tools import *
Requirement already satisfied: control in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (0.9.0)
Requirement already satisfied: matplotlib in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from control) (3.3.4)
Requirement already satisfied: scipy in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from control) (1.6.1)
Requirement already satisfied: numpy in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from control) (1.19.5)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from matplotlib->control) (1.3.1)
Requirement already satisfied: cycler>=0.10 in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from matplotlib->control) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from matplotlib->control) (2.8.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from matplotlib->control) (8.1.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\guisa\appdata\local\programs\python\python38\lib\site-packages (from matplotlib->control) (2.4.7)
Requirement already satisfied: six in c:\users\guisa\appdata\roaming\python\python38\site-packages (from cycler>=0.10->matplotlib->control) (1.15.0)
WARNING: You are using pip version 21.1.1; however, version 21.1.3 is available.
You should consider upgrading via the 'c:\users\guisa\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.
[3]:
# FILE_PATH = '/content/drive/MyDrive/01 - Iniciação Científica/02 - Datasets/csv_files/EN2_STAR_CHR_0101086161_20070516T060226_20071005T074409.csv'
# Path to the CoRoT light-curve CSV (the commented line above is the Colab/Drive variant).
# NOTE(review): absolute local path — consider a configurable DATA_DIR / pathlib.Path
# so the notebook runs on other machines.
FILE_PATH = 'C:/Users/guisa/Google Drive/01 - Iniciação Científica/02 - Datasets/csv_files/EN2_STAR_CHR_0101086161_20070516T060226_20071005T074409.csv'
[4]:
# https://docs.python.org/3/library/datetime.html#datetime.datetime.strptime
# Read the light curve and parse DATE in a single vectorized call.
# pd.to_datetime replaces the original element-wise strptime loop, which made
# O(n) Python calls and raised SettingWithCopyWarning via chained indexing.
# (Timestamp/Timedelta subclass datetime/timedelta, so the cells below still work.)
data_sample = pd.read_csv(FILE_PATH)
data_sample['DATE'] = pd.to_datetime(data_sample['DATE'], format='%Y-%m-%d %H:%M:%S.%f')
display(data_sample)

# Convenience handles used by the following cells.
time = data_sample.DATE        # timestamps (datetime64[ns] Series)
flux = data_sample.WHITEFLUX   # white-light flux values
<ipython-input-4-2c25a0a32d87>:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data_sample.DATE[i] = datetime.strptime(data_sample.DATE[i], '%Y-%m-%d %H:%M:%S.%f')
| DATE | WHITEFLUX | |
|---|---|---|
| 0 | 2007-05-16 18:10:55.071642 | 112626.77 |
| 1 | 2007-05-16 18:19:27.113766 | 112605.61 |
| 2 | 2007-05-16 18:27:59.155929 | 112771.50 |
| 3 | 2007-05-16 18:36:31.198092 | 113113.60 |
| 4 | 2007-05-16 18:45:03.240256 | 112621.79 |
| ... | ... | ... |
| 23946 | 2007-10-05 19:08:02.358665 | 112438.61 |
| 23947 | 2007-10-05 19:16:34.313684 | 112406.01 |
| 23948 | 2007-10-05 19:25:06.268742 | 112496.13 |
| 23949 | 2007-10-05 19:33:38.223801 | 112344.83 |
| 23950 | 2007-10-05 19:42:10.178859 | 112318.50 |
23951 rows × 2 columns
[5]:
# Plot the raw light curve (presumably `viz` comes from `from tools import *` — TODO confirm).
viz.view_lightcurve(time, flux)
[5]:
# Median sample rate of the time series
# (median of consecutive timestamp differences — robust to gaps and outliers).
sample_rate = time.diff().median()
sample_rate
[5]:
Timedelta('0 days 00:08:31.994769')
[6]:
# Sample size
# Total time span of the series. `.iloc` is positional, so this stays correct
# even if the Series index is not the default RangeIndex.
time.iloc[-1] - time.iloc[0]
[6]:
datetime.timedelta(days=142, seconds=5475, microseconds=107217)
[7]:
# First timestamp of the series.
time[0]
[7]:
datetime.datetime(2007, 5, 16, 18, 10, 55, 71642)
[ ]:
[ ]:
[ ]:
Another approach¶
[8]:
# Re-read the raw CSV and parse DATE with the vectorized pd.to_datetime
# (idiomatic alternative to the per-row strptime loop above).
data_sample = pd.read_csv(FILE_PATH)
data_sample.DATE = pd.to_datetime(data_sample.DATE)
date = data_sample.DATE
# (the original had a bare `date[0]` here; its value was discarded — only a
#  cell's last expression is displayed — so the dead statement was removed)
flux = data_sample.WHITEFLUX.to_numpy()
[9]:
# First five rows — sanity-check the parsed frame.
data_sample.head()
[9]:
| DATE | WHITEFLUX | |
|---|---|---|
| 0 | 2007-05-16 18:10:55.071642 | 112626.77 |
| 1 | 2007-05-16 18:19:27.113766 | 112605.61 |
| 2 | 2007-05-16 18:27:59.155929 | 112771.50 |
| 3 | 2007-05-16 18:36:31.198092 | 113113.60 |
| 4 | 2007-05-16 18:45:03.240256 | 112621.79 |
[82]:
# https://jakevdp.github.io/PythonDataScienceHandbook/03.11-working-with-time-series.html
# Build a flux Series indexed by timestamp, dropping the inherited 'DATE' index name.
dt_index = pd.DatetimeIndex(date).rename(None)
data_sample_two = pd.Series(flux, index=dt_index)
data_sample_two.head()
[82]:
2007-05-16 18:10:55.071642 112626.77
2007-05-16 18:19:27.113766 112605.61
2007-05-16 18:27:59.155929 112771.50
2007-05-16 18:36:31.198092 113113.60
2007-05-16 18:45:03.240256 112621.79
dtype: float64
[118]:
# Partial string indexing on the DatetimeIndex: all samples within that second.
data_sample_two['2007-05-19 01:22:07']
[118]:
2007-05-19 01:22:07.244849 111857.01
dtype: float64
[123]:
# Mean white flux over the whole series.
data_sample_two.mean()
[123]:
112501.16221318526
[125]:
# Collect the samples that fall below the series mean.
# Fixes vs. the original: don't shadow the builtin `list`, and hoist
# data_sample_two.mean() out of the loop — it was recomputed on every
# iteration, making the loop O(n^2).
mean_flux = data_sample_two.mean()
below_mean = [value for value in data_sample_two if value < mean_flux]
# (work in progress — original note: "PAREI Aqui" / "stopped here")
[115]:
import plotly.express as px

# Interactive raw light curve. Axis labels are set explicitly so the figure
# stands alone (plotly would otherwise label the axes just 'x' and 'y').
fig = px.line(x=time, y=flux, title='Raw Light Curve',
              labels={'x': 'Date', 'y': 'Whiteflux'})
fig.show()
[ ]:
[ ]:
[ ]:
ORMaster¶
Reading time series
[6]:
# Load the light curve again for this section and preview the first rows.
df = pd.read_csv(FILE_PATH)
df.head()
[6]:
| DATE | WHITEFLUX | |
|---|---|---|
| 0 | 2007-05-16 18:10:55.071642 | 112626.77 |
| 1 | 2007-05-16 18:19:27.113766 | 112605.61 |
| 2 | 2007-05-16 18:27:59.155929 | 112771.50 |
| 3 | 2007-05-16 18:36:31.198092 | 113113.60 |
| 4 | 2007-05-16 18:45:03.240256 | 112621.79 |
[7]:
# Dtypes and memory usage — DATE is still a plain `object` (string) column here.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23951 entries, 0 to 23950
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DATE 23951 non-null object
1 WHITEFLUX 23951 non-null float64
dtypes: float64(1), object(1)
memory usage: 374.4+ KB
Next, convert the DATE column from object (string) to datetime64.
[8]:
# Parse the DATE strings into datetime64[ns] values.
df["DATE"] = pd.to_datetime(df["DATE"])
[9]:
# Confirm the conversion: DATE is now datetime64[ns].
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23951 entries, 0 to 23950
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DATE 23951 non-null datetime64[ns]
1 WHITEFLUX 23951 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 374.4 KB
Setting the column DATE as the DataFrame index
[11]:
# Promote DATE to the index (reassignment instead of inplace mutation).
df = df.set_index('DATE')
[12]:
# The frame now has a DatetimeIndex and a single WHITEFLUX column.
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23951 entries, 2007-05-16 18:10:55.071642 to 2007-10-05 19:42:10.178859
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 WHITEFLUX 23951 non-null float64
dtypes: float64(1)
memory usage: 374.2 KB
Seasonality¶
[75]:
# Inspect the DatetimeIndex (irregularly sampled: freq=None).
df.index
[75]:
DatetimeIndex(['2007-05-16 18:10:55.071642', '2007-05-16 18:19:27.113766',
'2007-05-16 18:27:59.155929', '2007-05-16 18:36:31.198092',
'2007-05-16 18:45:03.240256', '2007-05-16 18:53:35.282419',
'2007-05-16 19:02:07.324582', '2007-05-16 19:10:39.366705',
'2007-05-16 19:19:11.408828', '2007-05-16 19:27:43.450951',
...
'2007-10-05 18:25:22.583936', '2007-10-05 18:33:54.538874',
'2007-10-05 18:42:26.493771', '2007-10-05 18:50:58.448709',
'2007-10-05 18:59:30.403687', '2007-10-05 19:08:02.358665',
'2007-10-05 19:16:34.313684', '2007-10-05 19:25:06.268742',
'2007-10-05 19:33:38.223801', '2007-10-05 19:42:10.178859'],
dtype='datetime64[ns]', name='DATE', length=23951, freq=None)
[183]:
# Boxplot of WHITEFLUX grouped by df.index.day — i.e. day-of-month (1–31), so
# the same day number from different months is pooled into a single box.
# NOTE(review): if per-calendar-day seasonality is intended, group by
# df.index.date (or df.index.dayofyear) instead — confirm.
df.groupby(df.index.day).boxplot(subplots=False, figsize=(15, 6), rot=90)
plt.title('Boxplot by day')
plt.ylabel('Whiteflux')
plt.show()
# One eclipse a week
[ ]:
[ ]:
# https://www.quora.com/How-do-you-identify-seasonality-in-a-time-series-data
Phase Folded¶
[358]:
# Convert the pandas frame (DatetimeIndex + WHITEFLUX) into an astropy
# TimeSeries so its period-folding utilities can be used below.
from astropy import timeseries
ts = timeseries.TimeSeries.from_pandas(df)
print(type(ts))
ts.pprint()
<class 'astropy.timeseries.sampled.TimeSeries'>
time WHITEFLUX
----------------------------- ----------
2007-05-16T18:10:55.071642000 112626.77
2007-05-16T18:19:27.113766000 112605.61
2007-05-16T18:27:59.155929000 112771.5
2007-05-16T18:36:31.198092000 113113.6
2007-05-16T18:45:03.240256000 112621.79
2007-05-16T18:53:35.282419000 112553.21
2007-05-16T19:02:07.324582000 112811.305
2007-05-16T19:10:39.366705000 112723.12
2007-05-16T19:19:11.408828000 112513.234
2007-05-16T19:27:43.450951000 112761.56
... ...
2007-10-05T18:16:50.628958000 112545.79
2007-10-05T18:25:22.583936000 112428.29
2007-10-05T18:33:54.538874000 112613.29
2007-10-05T18:42:26.493771000 112415.414
2007-10-05T18:50:58.448709000 112491.53
2007-10-05T18:59:30.403687000 112400.95
2007-10-05T19:08:02.358665000 112438.61
2007-10-05T19:16:34.313684000 112406.01
2007-10-05T19:25:06.268742000 112496.13
2007-10-05T19:33:38.223801000 112344.83
2007-10-05T19:42:10.178859000 112318.5
Length = 23951 rows
[356]:
# Raw light curve plotted against the Julian-date axis of the astropy TimeSeries.
plt.plot(ts.time.jd, ts['WHITEFLUX'], 'k.', markersize=1)
plt.xlabel('Time from midpoint epoch (days)')
plt.ylabel('SAP Flux (e-/s)')
plt.show()
[354]:
# https://docs.astropy.org/en/stable/timeseries/analysis.html
# https://docs.astropy.org/en/stable/api/astropy.timeseries.TimeSeries.html#astropy.timeseries.TimeSeries.fold
# https://docs.astropy.org/en/stable/units/#module-astropy.units
from astropy import units as u
# Fold the series on a trial 1-day period and plot the phase-folded curve.
ts_folded = ts.fold(period = 1 * u.day)
# TODO: tune the fold period (original note: "AJUSTAR O PERIOD")
plt.plot(ts_folded.time.jd, ts_folded['WHITEFLUX'], 'k.', markersize=1)
plt.xlabel('Time from midpoint epoch (days)')
plt.ylabel('SAP Flux (e-/s)')
plt.show()
[ ]:
[ ]:
[ ]: